#!/usr/bin/env python
#coding=utf-8
'''
Created on Aug 15, 2010
@author: lxd
Fetch personal names from http://www.sosuo.name/index_surname.asp
'''
import sys
sys.path.append('..')
from common.web import Web
web = Web()

def encodeName(str):
    '''
    Percent-encode a name for use in a URL, based on its GBK byte form.

    Every byte of the GBK encoding is written as an upper-case %XX escape,
    so the GBK bytes C0 EE become '%C0%EE'.

    str -- text to encode. The parameter name shadows the builtin; it is
           kept unchanged so existing callers keep working.
    Returns the escaped string ('' for empty input).
    Raises UnicodeEncodeError if the text cannot be encoded as GBK.
    '''
    # bytearray yields integers on both Python 2 and Python 3. Formatting
    # each byte directly replaces the former repr()-based approach, which
    # leaked the repr's quote characters into the result and upper-cased
    # printable trail bytes (0x61 became 0x41, i.e. a different character).
    gbk_bytes = bytearray(str.encode('gbk'))
    return ''.join(['%%%02X' % byte for byte in gbk_bytes])

import re
def parseName(f, limit=20):
    '''
    Extract names from a fetched page.

    f     -- file-like object; its read() result is searched for the link
             markup that wraps each name. The content may be GBK-encoded
             bytes or already-decoded text.
    limit -- maximum number of names to return (default 20, the value that
             used to be hard-coded).
    Returns a list of at most `limit` names as text, in page order; the
    list is empty when nothing matches.
    Raises UnicodeDecodeError if a matched byte sequence is not valid GBK.
    '''
    data = f.read()
    reg = r'target="_blank" rel="nofollow">(?P<name>[^<>]*?)</a> </TD>'
    if isinstance(data, bytes):
        # Byte content needs a byte pattern on Python 3; the pattern is
        # pure ASCII so the conversion is lossless on Python 2 as well.
        found = re.findall(reg.encode('ascii'), data, re.IGNORECASE)
        # Truncate before decoding so only the returned names are decoded.
        return [raw.decode('gbk') for raw in found[:limit]]
    # Already-decoded text: the matches are the names themselves.
    return re.findall(reg, data, re.IGNORECASE)[:limit]
    
# Surnames to look up; one page is requested per surname.
surnames = [
    u'李', u'王', u'张', u'刘', u'陈', u'杨', u'赵', u'黄', u'周', u'吴',
    u'徐', u'孙', u'胡', u'朱', u'高', u'林', u'何', u'郭', u'马', u'罗',
]

# Accumulates the names parsed from every surname page, in request order.
name_list = []
for surname in surnames:
    # The encoded surname is substituted into the site's URL path.
    url = 'http://www.sosuo.name/surname/%s/' % encodeName(surname)
    # The empty first argument is passed through as-is; its meaning is
    # defined by common.web.Web.fetchData (not visible here).
    name_list.extend(parseName(web.fetchData('', url)))
    
from common.tools import Tools
import os
# Store the collected names in the file 'db/name' below the directory
# returned by Tools.getFatherDir() (presumably the project's parent
# directory -- confirm against common.tools).
father_dir = Tools.getFatherDir()
name_file = os.path.join(father_dir, 'db', 'name')
Tools.saveData(name_file, name_list)
    
    
    
    
    
    
    
    
    



